---
title: "Recipes for some analyses modifications"
subtitle: "Where we present some piece of codes to modify the bioinformatic pipeline"
author: Adrien Taudière
date: last-modified
execute:
eval: false
format:
html:
code-fold: false
# bibliography: bibliography.bib
# link-citations: true
---
See also [ Easy16S ](https://github.com/YongxinLiu/EasyAmplicon/blob/master/pipeline_en.sh) for inspiration in bash.
## Change the parameter to merged forward and reverse sequences
1. Play with option min Overlap (default to 12) and maxMismach (default to 0). For example, replace the `merged_seq`
targets with the following code to allow 1 mismatch and a minimum overlap of 8.
```{r, eval=FALSE}
tar_target(
merged_seq,
mergePairs(
dadaF = ddF,
dadaR = ddR,
derepF = derep_fs,
derepR = derep_rs,
minOverlap = 8,
maxMismatch = 1)
),
format = "qs"
)
)
```
## Forward only pipeline
1. Replace all the "paired end" area with the following code
```{r, eval=FALSE}
##> Remove primers
tar_target(
cutadapt,
cutadapt_remove_primers(
path_to_fastq = here("data/data_raw/rawseq/"),
primer_fw = fw_primer_sequences,
folder_output = here("data/data_intermediate/seq_wo_primers/"),
args_before_cutadapt = "source ~/miniforge3/etc/profile.d/conda.sh && conda activate cutadaptenv && "
)
),
tar_target(data_raw, {
cutadapt
list_fastq_files(path = here::here("data/data_intermediate/seq_wo_primers/"),
paired_end = FALSE)
}),
##> Classical dada2 pipeline
tar_target(data_fnfs, data_raw$fnfs),
### Pre-filtered data with low stringency
tar_target(
filtered,
filter_trim(
output_fw = paste(
getwd(),
here("/data/data_intermediate/filterAndTrim_fwd"),
sep = ""
),
rev = data_fnrs,
multithread = n_threads,
compress = TRUE
)
),
### Dereplicate fastq files
tar_target(derep_fs, derepFastq(filtered[[1]]), format = "qs"),
tar_target(derep_rs, derepFastq(filtered[[2]]), format = "qs"),
### Learns the error rates
tar_target(err_fs, learnErrors(derep_fs, multithread = 4), format = "qs"),
tar_target(err_rs, learnErrors(derep_rs, multithread = 4), format = "qs"),
### Make amplicon sequence variants
tar_target(ddF, dada(derep_fs, err_fs, multithread = 4), format = "qs"),
tar_target(ddR, dada(derep_rs, err_rs, multithread = 4), format = "qs"),
### Build a a table of ASV x Samples
tar_target(seq_tab, makeSequenceTable(ddF)),
```
## Add a second taxonomic assignation using a different database or algorithm
1. Add a new database file (*fasta*) in `data/data_raw/refseq`
1. Copy and complete with good names the two targets below
1. Rename the targets by using the new name (e.g. `data_phyloseq_newDB` ) instead of `data_phyloseq` in the subsequent targets
```{r, eval=FALSE}
[...]
tar_target(
name = file_refseq_taxo2,
command = "data/data_raw/refseq/XXX",
format = "file"
)
[...]
tar_target(
data_phyloseq_newDB,
add_new_taxonomy_pq(
data_phyloseq,
file_refseq_taxo2,
suffix = "PR2",
taxLevels = c(
"Kingdom",
"Supergroup",
"Division",
"Subdivision",
"Class",
"Order",
"Family",
"Genus",
"Species"
)
)
)
```
## Filter by % sequence (whith blast)
```{r, eval=FALSE}
tar_target(d_blast,
filter_taxa_blast(
data_phyloseq,
fasta_for_db = paste0(here::here(), "/", file_refseq_taxo),
nproc = 4
)
)
```
## Add funguild informations for Fungi
```{r, eval=FALSE}
tar_target(d_funguild,
MiscMetabar::add_funguild_info(data_phyloseq)
)
```
## Add Protax informations for Bacteria
::: {.content-hidden}
## FAPROTAX analysis
### Assign functionnality
```{r}
# Inspiration from https://forum.qiime2.org/t/exporting-otu-table-from-phyloseq-into-either-biom-or-text-format/19103/7
write_biom_csv <- function (physeq, file, sep = "; " ) {
ps <- phyloseq:: taxa_sums (physeq) |>
as.data.frame ()
ps |> tibble:: rownames_to_column ("#OTU ID" ) |>
left_join (phyloseq:: tax_table (physeq) |>
as.data.frame () |>
tibble:: rownames_to_column ("#OTU ID" ) |>
tidyr:: unite ("taxonomy" , ! ` #OTU ID ` , sep = sep)) -> phyloseq_biom
write.table (phyloseq_biom, file = file, sep = " \t " , quote = FALSE , row.names = FALSE )
}
```
```{r}
here:: i_am ("report_16S.qmd" )
tax_tab_path <- here:: here ("output/faprotax_tax_table.csv" )
FAPROTAX_path <- here:: here ("code/FAPROTAX_1.2.7" )
d_vs@ tax_table <- tax_table (cbind (d_vs@ tax_table, "Genus_species" = paste (d_vs@ tax_table[,"Genus" ],
d_vs@ tax_table[,"Species" ]) ))
write.table (subset_taxa (d_vs, ! is.na (Species))@ tax_table, tax_tab_path, sep = ";" )
write_biom_csv (d_vs, tax_tab_path)
dir.create ("output/FAPROTAX" )
# system(pip install biom-format)
system (paste0 ("python3 " , FAPROTAX_path, "/collapse_table.py" , " -i " , tax_tab_path, " -o output/FAPROTAX/func_table.csv -g " , FAPROTAX_path, '/FAPROTAX.txt -r output/FAPROTAX/report.txt -s output/FAPROTAX/output_subtables/ -f -c "#" -d "taxonomy" --omit_columns 0 --column_names_are_in last_comment_line -n columns_after_collapsing -v --out_groups2records_table output/FAPROTAX/faprotax_result_OTU.tsv' ))
func_tab <- read.table ("output/FAPROTAX/faprotax_result_OTU.tsv" , sep= " \t " , header = TRUE )
colnames (func_tab) <- gsub ("FPTax_record" , "record_faprotax" , paste0 ("FPTax_" , colnames (func_tab)))
tax_tab <-
as.matrix (cbind (
tax_table (d_vs),
func_tab
))
d_vs@ tax_table <- tax_table (tax_tab)
```
### Functionnal analysis
```{r}
func_group <- colSums (apply (d_vs@ tax_table[,grepl ("FPTax" ,colnames (d_vs@ tax_table))], 2 , as.numeric))
func_group <- func_group[func_group> 0 ]
DT:: datatable (as.data.frame (sort (func_group, decreasing = T)))
func_group_more_than20 <- func_group[func_group> 20 ]
d_vs_Treatment <- speedyseq:: merge_samples2 (d_vs, "Treatment" )
```
```{r}
psm <- psmelt (d_vs) |>
filter (Abundance > 0 ) |>
mutate (across (all_of (starts_with ("FPTax_" )), as.numeric))
psm |>
group_by (Sample) |>
summarise ("sumFPTax_chemoheterotrophy" = sum (FPTax_chemoheterotrophy))
ggplot (psm, aes (x = Treatment, y = FPTax_chemoheterotrophy, fill = Sample)) +
geom_bar (stat = "identity" )
ggplot (psm, aes (x = Treatment, y = FPTax_chemoheterotrophy)) + geom_boxplot ()
psm2 <- psm |>
group_by (Sample) |>
summarise (across (all_of (starts_with ("FPTax_" )), sum)) |>
tidyr:: pivot_longer (all_of (starts_with ("FPTax_" )),
names_to = "Function" ,
values_to = "Nb_ASV" )
psm2_na <- psm |>
group_by (Sample) |>
summarise (across (all_of (starts_with ("FPTax_" )), function (x) {
sum (x == 0 )
})) |>
tidyr:: pivot_longer (all_of (starts_with ("FPTax_" )),
names_to = "Function" ,
values_to = "Nb_ASV_NA" )
psm2_nbseq <- psm |>
group_by (Sample) |>
summarise (across (all_of (starts_with ("FPTax_" )), function (x) {
sum (x * Abundance)
})) |>
tidyr:: pivot_longer (all_of (starts_with ("FPTax_" )),
names_to = "Function" ,
values_to = "Nb_seq" )
psm3 <- psm2 |>
filter (Function %in% names (func_group_more_than20))
psm3_na <- psm2_na |>
filter (Function %in% names (func_group_more_than20))
psm3_nbseq <- psm2_nbseq |>
filter (Function %in% names (func_group_more_than20))
psm_4 <- cbind (psm3,
"Nb_ASV_NA" = psm3_na$ Nb_ASV_NA,
"Nb_seq" = psm3_nbseq$ Nb_seq)
psm_4 <- psm_4 |>
mutate ("Prop_ASV" = Nb_ASV/ (Nb_ASV + Nb_ASV_NA))
psm_4$ Treatment <- d_vs@ sam_data$ Treatment[match (psm_4$ Sample, sample_names (d_vs))]
ggplot (psm_4, aes (y = Function, x = Nb_ASV, fill= Treatment)) +
geom_boxplot ()
ggplot (psm_4, aes (y = Function, x = Nb_seq, fill= Treatment)) +
geom_boxplot ()
ggstatsplot:: ggbetweenstats (psm_4 |> filter (Function == "FPTax_chemoheterotrophy" ),
Treatment, Nb_ASV)
```
```{r}
psm_treat <- psmelt (d_vs_Treatment) |>
filter (Abundance > 0 ) |>
mutate (across (all_of (starts_with ("FPTax_" )), as.numeric))
psm_treat2 <- psm_treat |>
group_by (Treatment) |>
summarise (across (all_of (starts_with ("FPTax_" )), sum)) |>
group_by (Treatment) |>
tidyr:: pivot_longer (all_of (starts_with ("FPTax_" )),
names_to = "Function" ,
values_to = "Nb_ASV" )
psm_treat2_na <- psm_treat |>
group_by (Treatment) |>
summarise (across (all_of (starts_with ("FPTax_" )), function (x){
sum (x == 0 )
})) |>
group_by (Treatment) |>
tidyr:: pivot_longer (all_of (starts_with ("FPTax_" )),
names_to = "Function" ,
values_to = "Nb_ASV_NA" )
psm_treat2_seq <- psm_treat |>
group_by (Treatment) |>
summarise (across (all_of (starts_with ("FPTax_" )), function (x){
sum (x* Abundance)
})) |>
group_by (Treatment) |>
tidyr:: pivot_longer (all_of (starts_with ("FPTax_" )),
names_to = "Function" ,
values_to = "Nb_seq" )
psm_treat3 <- psm_treat2 |>
filter (Function %in% names (func_group_more_than20))
psm_treat3_na <- psm_treat2_na |>
filter (Function %in% names (func_group_more_than20))
psm_treat3_seq <- psm_treat2_seq |>
filter (Function %in% names (func_group_more_than20))
psm_treat4 <- cbind (psm_treat3,
"Nb_ASV_NA" = psm_treat3_na$ Nb_ASV_NA,
"Nb_seq" = psm_treat3_seq$ Nb_seq)
psm_treat4 <- psm_treat4 |>
mutate ("Prop_ASV" = Nb_ASV/ (Nb_ASV + Nb_ASV_NA))
ggplot (psm_treat4, aes (y = Function, x = Nb_ASV, fill = Treatment)) +
geom_bar (stat = "identity" , position = "Fill" )
ggplot (psm_treat4, aes (y = Function, x = Nb_seq, fill = Treatment)) +
geom_bar (stat = "identity" , position = "Fill" )
ggplot (psm_treat4, aes (y = Treatment, x = Prop_ASV, fill = Treatment)) +
geom_bar (stat = "identity" ) + facet_wrap (~ Function)
```
:::